# Map USPS postal abbreviations to full names: the 50 states plus the
# District of Columbia and five inhabited US territories (56 entries).
POSTAL_TO_STATE = as.list(c(
  AL = 'Alabama',              AK = 'Alaska',
  AS = 'American Samoa',       AZ = 'Arizona',
  AR = 'Arkansas',             CA = 'California',
  CO = 'Colorado',             CT = 'Connecticut',
  DE = 'Delaware',             DC = 'District of Columbia',
  FL = 'Florida',              GA = 'Georgia',
  GU = 'Guam',                 HI = 'Hawaii',
  ID = 'Idaho',                IL = 'Illinois',
  IN = 'Indiana',              IA = 'Iowa',
  KS = 'Kansas',               KY = 'Kentucky',
  LA = 'Louisiana',            ME = 'Maine',
  MD = 'Maryland',             MA = 'Massachusetts',
  MI = 'Michigan',             MN = 'Minnesota',
  MS = 'Mississippi',          MO = 'Missouri',
  MT = 'Montana',              NE = 'Nebraska',
  NV = 'Nevada',               NH = 'New Hampshire',
  NJ = 'New Jersey',           NM = 'New Mexico',
  NY = 'New York',             NC = 'North Carolina',
  ND = 'North Dakota',         MP = 'Northern Mariana Islands',
  OH = 'Ohio',                 OK = 'Oklahoma',
  OR = 'Oregon',               PA = 'Pennsylvania',
  PR = 'Puerto Rico',          RI = 'Rhode Island',
  SC = 'South Carolina',       SD = 'South Dakota',
  TN = 'Tennessee',            TX = 'Texas',
  UT = 'Utah',                 VT = 'Vermont',
  VI = 'Virgin Islands',       VA = 'Virginia',
  WA = 'Washington',           WV = 'West Virginia',
  WI = 'Wisconsin',            WY = 'Wyoming'))

# Lower-case postal codes for the 50 states only (DC and the territories
# present in POSTAL_TO_STATE are deliberately excluded here).
states = tolower(c(
  "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI",
  "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI",
  "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
  "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
  "VT", "VA", "WA", "WV", "WI", "WY"))

# Template URL for one state's daily symptoms CSV in the open-covid-19-data
# repository. The {state} / {state_underscore} placeholders are filled in by
# load_state_data().
BASE_DAILY_URL = paste(
      'https://raw.githubusercontent.com/google-research/open-covid-19-data',
      'master/data/exports/search_trends_symptoms_dataset',
      'United%20States%20of%20America/subregions/{state}',
      '2020_US_{state_underscore}_daily_symptoms_dataset.csv',
      sep = '/')
# In-memory per-state cache, filled lazily by load_state_data().
cache_data_list = list()
# Plain-English descriptions for a handful of symptom signals; joined onto
# results by the `signal` column to annotate tables and plot titles.
signal_description_df = tribble(
    ~signal,                      ~description,
    'Podalgia',                   'pain in the foot',
    'Anosmia',                    'loss of smell',
    'Purpura',                    "red/purple skin spots; 'blood spots'",
    'Radiculopathy',              'pinched nerve',
    'Ageusia',                    'loss of taste',
    'Erythema chronicum migrans', 'expanding rash early in lyme disease',
    'Photodermatitis',            'allergic rash that reqs light'
)
# Expand a two-letter postal code (any case) to the full state name using
# POSTAL_TO_STATE; yields NULL for codes not present in the lookup table.
expand_state_name = function(state) {
  POSTAL_TO_STATE[[toupper(state)]]
}

# Load the daily symptoms dataset for one state (two-letter code).
# Results are memoised in `cache_data_list` for the session, and the raw
# CSV is cached on disk under cache/ so each state downloads at most once.
load_state_data = function(state) {
  # Fast path: already loaded this session.
  if (state %in% names(cache_data_list)) return (cache_data_list[[state]])
  state_fname = sprintf('cache/%s.csv', state)
  # Download only if there is no cached CSV on disk.
  if (!file.exists(state_fname)) {
    state_name = expand_state_name(state)
    message(sprintf('Downloading data for %s...', state_name))
    state_name_underscore = str_replace_all(state_name, ' ', '_')
    state_url = str_replace_all(BASE_DAILY_URL,
                                fixed('{state}'), state_name)
    state_url = str_replace_all(state_url,
                                fixed('{state_underscore}'),
                                state_name_underscore)
    # Multi-word state names introduce literal spaces into the URL;
    # percent-encode them.
    state_url = str_replace_all(state_url, fixed(' '), '%20')
    # If the download fails, remove any partial file so a later call does
    # not mistake it for a valid cached copy, then re-raise the error.
    tryCatch(
      download.file(state_url, state_fname),
      error = function(e) {
        if (file.exists(state_fname)) unlink(state_fname)
        stop(e)
      })
  }
  single_state = readr::read_csv(state_fname)
  cache_data_list[[state]] <<- single_state  # memoise for this session
  return (single_state)
}


# Extract one symptom's county-level daily series for a state, reshaped into
# covidcast-style columns (geo_value, signal, time_value, value, ...).
pull_data_state = function(state, symptom) {
  single_state = load_state_data(state)
  # Keep county rows only; state-level rows have NA sub_region_2_code.
  single_state_counties = single_state[!is.na(single_state$sub_region_2_code),]
  selected_symptom = paste0('symptom:', symptom)
  single_state_symptom = single_state_counties[,c('sub_region_2_code',
                                                  'date',
                                                  selected_symptom)]
  # Shape into what we want
  colnames(single_state_symptom) = c('geo_value', 'time_value', 'value')
  single_state_symptom %>%
    filter(
      !is.na(value),
    ) %>%
    transmute(
      # Zero-pad county codes to 5-digit FIPS strings.
      geo_value = sprintf('%05d', as.numeric(geo_value)),
      signal = symptom,
      time_value = time_value,
      direction = NA,
      issue = lubridate::today(),
      lag = issue - time_value,
      value = value,
      stderr = NA,
      sample_size = NA,
      data_source = 'google_symptoms',
    )
}

Summary

Initial ingestion and exploration

# Build (or reload from RDS) the long data frame of every symptom signal for
# every state in `states`.
if (file.exists('symptom_df.RDS')) {
  symptom_df = readRDS('symptom_df.RDS')
  symptom_names = unique(symptom_df$signal)
} else {
  # Quietly tolerate an existing cache/ directory from a partial earlier run.
  dir.create('./cache/', showWarnings = FALSE)
  # Use one small state to discover the available symptom columns.
  ak = load_state_data('ak')
  symptom_cols = colnames(ak)[
                    str_detect(colnames(ak), 'symptom:')]
  symptom_names = str_replace(symptom_cols, fixed('symptom:'), '')

  symptom_df_list = vector('list', length(symptom_names))
  names(symptom_df_list) = symptom_names

  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    states_list = vector('list', length(states))
    # seq_along() is safe even if `states` were empty (1:length would not be).
    for (idx in seq_along(states)) {
      states_list[[idx]] = pull_data_state(states[idx], symptom)
    }
    symptom_df_list[[symptom]] = bind_rows(states_list)
  }
  symptom_df = bind_rows(symptom_df_list)
  saveRDS(symptom_df, 'symptom_df.RDS')
}
# Analysis window.
start_day = "2020-03-01"
end_day = "2020-08-15"

# 7-day-averaged incident case counts (JHU CSSE) over the window.
df_inum = covidcast_signal(data_source = "jhu-csse",
                           signal = "confirmed_7dav_incidence_num",
                           start_day = start_day, end_day = end_day)

# Restrict both data sets to counties with at least `case_num` total cases.
case_num = 500
geo_values = df_inum %>%
  group_by(geo_value) %>%
  summarize(total = sum(value)) %>%
  filter(total >= case_num) %>%
  pull(geo_value)
df_inum_act = df_inum %>% filter(geo_value %in% geo_values)
symptom_df_act = symptom_df %>% filter(geo_value %in% geo_values)

Here we plot the availability of each symptom over time (the proportion is the percentage of counties for which the symptom was available). We see that for each signal, the availability level is consistent over time, subject to a strong weekend effect.

# Share of active counties reporting each signal on each day.
# `.groups = 'drop'` returns an ungrouped result directly, replacing the
# trailing ungroup() and silencing the "regrouping output" message.
availability_df = symptom_df_act %>%
  group_by(time_value, signal) %>%
  summarize(prop_available = n() / length(geo_values), .groups = 'drop')
# One faint line per signal: availability over time.
plt = ggplot(availability_df) +
  geom_line(aes(x = time_value,
                y = prop_available,
                group = factor(signal)),
            color = 'dodgerblue4',
            size = 0.1)
plt

The symptoms for which data is most sparse are:

# Signals available in at most 5% of counties on an average day, sparsest
# first. `.groups = 'drop'` replaces ungroup() and silences the message.
most_missing = availability_df %>%
  group_by(signal) %>%
  summarize(avg_available = mean(prop_available), .groups = 'drop') %>%
  filter(avg_available <= 0.05) %>%
  arrange(avg_available)
print(most_missing)
## # A tibble: 16 x 2
##    signal                   avg_available
##    <chr>                            <dbl>
##  1 Viral pneumonia                 0.0279
##  2 Aphonia                         0.0334
##  3 Crackles                        0.0353
##  4 Burning Chest Pain              0.0363
##  5 Urinary urgency                 0.0369
##  6 Polydipsia                      0.0371
##  7 Photodermatitis                 0.0380
##  8 Dysautonomia                    0.0414
##  9 Shallow breathing               0.0414
## 10 Allergic conjunctivitis         0.0435
## 11 Laryngitis                      0.0437
## 12 Ventricular fibrillation        0.0455
## 13 Angular cheilitis               0.0462
## 14 Myoclonus                       0.0462
## 15 Rectal pain                     0.0489
## 16 Stridor                         0.0491

For the signal that is most sparsely available, the number of counties at which it tends to be available daily is:

# Average number of counties per day reporting the sparsest signal.
print(min(most_missing$avg_available) * length(geo_values))
## [1] 29.52653

Based on this, we leave all the symptoms in for the full correlations analysis.

Correlations

# Per-day Spearman correlation between each symptom signal and case counts
# (computed once and cached in cor_df.RDS).
if (file.exists('cor_df.RDS')) {
  cor_df = readRDS('cor_df.RDS')
} else {
  # Only allocate the accumulator when we actually recompute.
  cor_list = vector('list', length(symptom_names))
  names(cor_list) = symptom_names
  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    df_cor1 = covidcast_cor(symptom_df_act %>% filter(signal == symptom),
                            df_inum_act,
                            by = "time_value",
                            method = "spearman")
    df_cor1['signal'] = symptom
    cor_list[[symptom]] = df_cor1
  }
  cor_df = bind_rows(cor_list)
  saveRDS(cor_df, 'cor_df.RDS')
}
# left_join's key argument is `by=`, not `on=`; the original `on='signal'`
# was silently swallowed by `...` and the join fell back to a natural join.
cor_df = cor_df %>% left_join(signal_description_df, by = 'signal')

Correlation over time: all symptoms

## Warning: Removed 7174 row(s) containing missing values (geom_path).

Correlation over time: largest single-day correlation

When we discuss the “size” of a correlation, we consider the absolute value of correlation.

# For each signal, keep the day(s) achieving its all-time max |correlation|,
# then keep the 5 rows with the largest |correlation| overall.
# NOTE(review): an exact tie within a signal keeps multiple rows, so head(5)
# could include the same signal twice -- confirm this is acceptable.
top_cor_signals = plot_cor_df %>% group_by (
    signal,
  ) %>% filter (
    abs(value) == max(abs(value), na.rm=TRUE),
  ) %>% ungroup (
  ) %>% arrange(
    -abs(value),
  ) %>% head (
    5,
  )
# Summary statistics of the daily correlations for the top-5 signals.
# `.groups = 'drop'` replaces ungroup() and silences the summarise message.
top_cor_sum_stats = plot_cor_df %>%
  filter(signal %in% top_cor_signals$signal) %>%
  group_by(signal) %>%
  summarize(
    min = min(value, na.rm=TRUE),
    quart1 = quantile(value, 0.25, na.rm=TRUE),
    med = median(value, na.rm=TRUE),
    mean = mean(value, na.rm=TRUE),
    quart3 = quantile(value, 0.75, na.rm=TRUE),
    max = max(value, na.rm=TRUE),
    .groups = 'drop',
  )
print('Symptoms with the largest all-time correlation:')
## [1] "Symptoms with the largest all-time correlation:"
# Use `by=` (not `on=`, which left_join ignores); the explicit key also
# suppresses the "Joining, by = ..." message.
print(top_cor_signals %>% left_join(top_cor_sum_stats, by = 'signal')
        %>% select(-time_value, -value),
      width=100)
## # A tibble: 5 x 8
##   signal                     description                              min quart1
##   <chr>                      <chr>                                  <dbl>  <dbl>
## 1 Viral pneumonia            <NA>                                 -0.898  -0.269
## 2 Anosmia                    loss of smell                        -0.0788  0.403
## 3 Ageusia                    loss of taste                        -0.423   0.260
## 4 Erythema chronicum migrans expanding rash early in lyme disease -0.710  -0.556
## 5 Photodermatitis            allergic rash that reqs light        -0.709  -0.487
##       med    mean quart3   max
##     <dbl>   <dbl>  <dbl> <dbl>
## 1  0.0853  0.0216  0.327 0.668
## 2  0.610   0.545   0.725 0.833
## 3  0.552   0.450   0.682 0.797
## 4 -0.276  -0.325  -0.179 0.208
## 5 -0.366  -0.349  -0.211 0.184
# All signals in faint cornsilk, with the top-5 signals drawn on top in color.
plt = ggplot(plot_cor_df) +
  geom_line(aes(x = time_value,
                y = value,
                group = factor(signal)),
            data = plot_cor_df %>% filter(
              !signal %in% top_cor_signals$signal
            ),
            color = 'cornsilk',
            size = 0.1,
            alpha = 1.0) +
  geom_line(aes(x = time_value,
                y = value,
                group = factor(signal),
                colour = factor(signal)),
            data = plot_cor_df %>% filter(
              signal %in% top_cor_signals$signal
            ),
            #color='darkorange',
            size = 0.3) +
  ylab('rank correlation') +
  scale_x_date(breaks = lubridate::ymd(c(
    '2020-03-01', '2020-03-15', '2020-04-01', '2020-04-15', '2020-05-01',
    '2020-05-15', '2020-06-01', '2020-06-15', '2020-07-01', '2020-07-15',
    '2020-08-01', '2020-08-15'))) +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Top 5 signals by all-time max(|corr|)")
plt
## Warning: Removed 7089 row(s) containing missing values (geom_path).
## Warning: Removed 85 row(s) containing missing values (geom_path).

Correlation over time: “consistently away from zero” symptoms

## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Symptoms that consistently stay away from 0 correlation:"
## Joining, by = "signal"
## # A tibble: 5 x 8
##   signal                 description                              min quart1
##   <chr>                  <chr>                                  <dbl>  <dbl>
## 1 Podalgia               pain in the foot                     -0.423  -0.279
## 2 Restless legs syndrome <NA>                                 -0.528  -0.410
## 3 Anosmia                loss of smell                        -0.0788  0.403
## 4 Purpura                red/purple skin spots; 'blood spots' -0.533  -0.366
## 5 Radiculopathy          pinched nerve                        -0.477  -0.349
##      med   mean quart3     max
##    <dbl>  <dbl>  <dbl>   <dbl>
## 1 -0.217 -0.223 -0.166 -0.0710
## 2 -0.337 -0.310 -0.215 -0.0530
## 3  0.610  0.545  0.725  0.833 
## 4 -0.283 -0.289 -0.206 -0.0477
## 5 -0.278 -0.256 -0.154 -0.0425
## Warning: Removed 7089 row(s) containing missing values (geom_path).
## Warning: Removed 85 row(s) containing missing values (geom_path).

## [1] "Podalgia"               "Restless legs syndrome" "Anosmia"               
## [4] "Purpura"                "Radiculopathy"
## [1] "Anosmia"

Interestingly, although anosmia technically is one of the “most stable” by our criterion, its distribution actually covers zero.

Correlation across location: TODO

# Per-county (all-time) Spearman correlation for each symptom, cached in
# geo_cor_df.RDS.
if (file.exists('geo_cor_df.RDS')) {
  geo_cor_df = readRDS('geo_cor_df.RDS')
} else {
  geo_cor_list = vector('list', length(symptom_names))
  names(geo_cor_list) = symptom_names

  for (symptom in symptom_names) {
    cat(symptom, '...\n')
    df_cor1 = covidcast_cor(symptom_df_act %>% filter(signal == symptom),
                            df_inum_act,
                            by = "geo_value",
                            method = "spearman")
    df_cor1['signal'] = symptom
    geo_cor_list[[symptom]] = df_cor1
  }
  geo_cor_df = bind_rows(geo_cor_list)
  saveRDS(geo_cor_df, 'geo_cor_df.RDS')
}
# left_join's key argument is `by=`, not `on=`; `on='signal'` was silently
# ignored and the join fell back to a natural join.
geo_cor_df = geo_cor_df %>% left_join(signal_description_df, by = 'signal')
# Render a county choropleth of geo_cor_df correlations for one symptom.
# Factors out the body that was duplicated verbatim in the two loops below.
plot_symptom_geo_cor = function(symptom) {
  df_cor2 = geo_cor_df %>% filter (signal == symptom)
  # Dress the data frame up as a covidcast_signal object so the covidcast
  # plot() method (choropleth) applies.
  df_cor2$time_value = min_available_time
  df_cor2$issue = min_available_time
  attributes(df_cor2)$geo_type = 'county'
  class(df_cor2) = c("covidcast_signal", "data.frame")
  n_available_county = df_cor2 %>% filter (!is.na(value)) %>% nrow()

  # Plot choropleth maps, using the covidcast plotting functionality
  title_text = sprintf("Correlations between cases and %s (%d counties)",
                             symptom, n_available_county)
  # Append the plain-English description when one is available.
  if (!is.na(df_cor2$description[1])) {
    title_text = paste0(title_text, '\n', sprintf('(%s)', df_cor2$description[1]))
  }
  print(plot(df_cor2,
             title = title_text,
            range = c(-1, 1), choro_col = c("orange","lightblue", "purple")))
}

# Maps for the signals with the largest single-day correlations.
for (symptom in top_cor_signals$signal) {
  plot_symptom_geo_cor(symptom)
}

# Maps for the "consistently away from zero" signals.
for (symptom in top_min_cor$signal) {
  plot_symptom_geo_cor(symptom)
}

TODO:

Appendix

Some things I’m still worried about / next steps: